import pandas as pd
import numpy as np
import plotly
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.pyplot import cm
from matplotlib.gridspec import GridSpec
import plotly.figure_factory as ff
from plotly.figure_factory import create_distplot
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import dendrogram, linkage, set_link_color_palette
from scipy.stats import norm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, silhouette_score
import sys
import warnings
if not sys.warnoptions:
warnings.simplefilter('ignore')
This analysis explores the features associated with customers of a gym chain.
The dataset is compiled from customer records. The chain is developing a customer interaction strategy based on this data. The goal is to devise a customer retention strategy based on an analysis of customers who churn and those who stay.
Classification models predicting churn are trained herein with sklearn. Then, the K-means algorithm locates clusters for further analysis.
# Load the gym-churn dataset and take a first look at its structure.
gym = pd.read_csv('gym_churn_us.csv')
gym.shape
gym.info(memory_usage='deep')
# Double-check: any rows containing null values?
gym[gym.isna().any(axis=1)]
# Check for fully duplicated rows.
gym[gym.duplicated()]
gym.sample(3)
# Shorten the column names for convenience.
gym.columns = ['gen_id', 'near', 'partnr', 'friend', 'phone', 'contract',
               'group', 'age', 'xtra_charges_tot', 'contract_left',
               'lifetime', 'avg_freq_total', 'avg_freq_curr_mon', 'churn']
gym['xtra_charges_tot'] = gym['xtra_charges_tot'].round(2)
gym.sample()
# Months left on the contract arrives as float; cast to int.
gym['contract_left'] = gym['contract_left'].astype(int)
gym.describe()
# How many zeroes are in each column?
for col in gym.columns:
    print(col, (gym[col] == 0).sum())
There are a lot of zero values in this dataset. But I do not think that indicates that features are missing. Here are the reasons for the zeroes. In lifetime value, zero lifetime indicates that the person has been a member of the gym for less than one month. For gender, zero indicates that the dataset creators couldn't think of a better value for non-males. For the near location, zero indicates that the person does not live close by. For phone, zero indicates the person did not supply a phone number, which accounts for about one-tenth of the dataset. For partner, although zeroes proliferate, nearly half of all people are indeed part of partner companies. More than two-thirds did not sign up with a friend's promo code. Sixty percent do not take part in group sessions. For average visits per week compared to last month, 4.5% haven't come to the gym. These zeroes do not reflect aberrant gym customer behavior; they reflect ordinary gym customer behavior.
# Split the customers into those who churned and those who stayed.
gymchurn = gym.loc[gym['churn'] == 1]
gymstay = gym.loc[gym['churn'] == 0]
print(gymchurn.shape)
gymstay.shape
# Palette shared by the feature-distribution plots below.
colorb = ['skyblue', 'tomato', 'deepskyblue', '#900C3F']

def distrib_plot(df, feature):
    """Plot the distribution of every column of ``df``, split by ``feature``.

    Columns with more than 12 distinct values are treated as continuous and
    drawn as overlaid KDE curves (one per value of ``feature``); the rest
    are drawn as grouped count plots. A legend appears only on two-valued
    columns.
    """
    plt.figure(figsize=(15, 15))
    area = GridSpec(5, 3)
    for i, column in enumerate(df.drop(feature, axis=1).columns):
        plt.subplot(area[i // 3, i % 3], title=column.replace('_', ' '))
        values = len(df[column].unique())
        features = sorted(df[feature].unique())
        if values > 12:  # continuous
            # Distinct index name so the outer subplot counter `i`
            # is not shadowed by this inner loop.
            for j, x in enumerate(features):
                sns.distplot(df[df[feature] == x][column], hist=False, kde_kws={
                    'shade': True, 'linewidth': 1}, color=colorb[j])
        else:  # discrete
            # BUG FIX: was `data=gym` (a module-level global), which silently
            # ignored the `df` argument for all count plots.
            sns.countplot(column, hue=feature, data=df, palette=colorb,
                          alpha=.5, saturation=.8)
            plt.gca().get_legend().remove()
            if values == 2:
                if column == 'gen_id':
                    plt.xticks(np.arange(values), ('F', 'M'))
                    if feature == 'churn':
                        legend = ['stay', 'churn']
                    else:
                        legend = features
                    plt.legend(legend, shadow=True, fancybox=True,
                               title=feature, loc='best')
                else:
                    plt.xticks(np.arange(values), ('no', 'yes'))
            else:
                plt.xticks(np.arange(values),
                           [int(x) for x in sorted(df[column].unique())])
        plt.xlabel('')
        plt.ylabel('')
    plt.tight_layout()
    plt.suptitle('Stay v Churn: Feature Distribution', fontsize=20, y=1.02)
    plt.show()

distrib_plot(gym, 'churn')
# Compare the mean of every feature for churners vs stayers.
gymchurnfeat = gymchurn.drop('churn', axis=1)
gymstayfeat = gymstay.drop('churn', axis=1)
avggymchurnfeat = gymchurnfeat.mean()
print("Averages for those who churn:")
avggymchurnfeat
avggymstayfeat = gymstayfeat.mean()
print("Averages for those who stay:")
print(avggymstayfeat)
print("The difference between those who stay/churn averages:")
print(avggymstayfeat - avggymchurnfeat)
There are key differences between those who churn and those who stay. Gender is the only feature whose average is nearly identical for the two groups. Every other feature shows a greater difference in averages.
So those who stay live/work closer, are more likely to be employed by a partner company and have a greater portion coming in on the friend promotion and with a group. They are more likely to provide a phone number.
They are older. On average, they spend more on extra charges. They opted for longer contracts and have longer customer lifetimes, as well as coming into the gym more frequently on average.
# Separate the boolean features (suffix _b) from the continuous ones (_v).
continuous_cols = ['contract', 'age', 'xtra_charges_tot', 'contract_left',
                   'lifetime', 'avg_freq_total', 'avg_freq_curr_mon']
boolean_cols = ['gen_id', 'near', 'partnr', 'friend', 'phone', 'group']
gymchurnfeat_b = gymchurnfeat.drop(continuous_cols, axis=1)
gymchurnfeat_v = gymchurnfeat.drop(boolean_cols, axis=1)
gymstayfeat_b = gymstayfeat.drop(continuous_cols, axis=1)
gymstayfeat_v = gymstayfeat.drop(boolean_cols, axis=1)
gymchurnfeat_b.columns
# Count plots of the six boolean features, churn customers only.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(10, 7))
cols = ['#ffa494', '#fa2600']
for i, column in enumerate(gymchurnfeat_b.columns):
    ax = axes[i // 3, i % 3]
    chart = sns.countplot(gymchurnfeat_b[column], ax=ax,
                          palette=cols, saturation=.2)
    chart.set_ylabel('')
fig.suptitle('Churn Customer Distribution: Gender,Near,Partner,Friend,Phone,Group', fontsize=18);
plt.show()
# Overlay churn and stay counts for the boolean features on shared axes
# (churn frame drawn first, stay frame drawn on top, as before).
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(10, 7))
cols = ['tomato', '#81d4fa']
for frame in (gymchurnfeat_b, gymstayfeat_b):
    for i, column in enumerate(frame.columns):
        ax = axes[i // 3, i % 3]
        chart = sns.countplot(frame[column], ax=ax,
                              palette=cols, saturation=.3)
        chart.set_ylabel('')
fig.suptitle('Churn v Stay Distribution: Discrete Features', fontsize=18);
plt.show()
Friend and gender features are fairly balanced for both churn/stay. The group and partner features have a higher portion of those who stay than those who churn. The providing phone number and living/working nearby features have a big difference between those who stay and those who churn: those who stay are much more likely to provide a phone number and live/work nearby.
# Interactive (plotly) distribution of the boolean features, churners.
colors = ['#005EA0', '#0080BD', 'rosybrown', 'tomato', '#81d4fa', '#00BBE3']
fig = ff.create_distplot(
    [gymchurnfeat_b[c] for c in gymchurnfeat_b.columns],
    gymchurnfeat_b.columns, show_rug=False, bin_size=.18, colors=colors)
# FIX: corrected "Fried" -> "Friend" in the displayed chart title.
fig.update_layout(title='Churn: Interactive Distribution of Gender,Near,Partner,Friend,Phone,Group Features',
                  plot_bgcolor='rgb(255,253,228)')
fig.update_xaxes(title_text=" 0=no group, no phone, no friend, no partner, not near, not male 1=group visit, phone, friend promo, partner company, nearby, male")
fig.show()
# Interactive (plotly) distribution of the boolean features, stayers.
colors1 = ['#81d4fa', 'tomato', 'palegoldenrod', '#FFDF00', '#81d4fa', 'orange']
fig = create_distplot(
    [gymstayfeat_b[c] for c in gymstayfeat_b.columns],
    gymstayfeat_b.columns, show_rug=False, bin_size=.18, colors=colors1)
# FIX: corrected "Fried" -> "Friend" in the displayed chart title.
fig.update_layout(title='Stay: Interactive Distribution of Gender,Near,Partner,Friend,Phone,Group Features',
                  plot_bgcolor='rgb(255,253,228)')
fig.update_xaxes(title_text=" 0=no group, no phone, no friend, no partner, not near, not male 1=group visit, phone, friend promo, partner company, nearby, male")
fig.show()
# Overlay the extra-charges distributions for churn vs stay customers.
sns.distplot(gymchurnfeat_v['xtra_charges_tot'], bins=50, color='tomato')
sns.distplot(gymstayfeat_v['xtra_charges_tot'], bins=50, color='#00BBE3')
plt.title("Distribution of Total Extra Charges: Churn v Stay")
plt.xlabel("Dollar amount of total extra charges")
# Narrow the continuous features: first drop the money column, then keep
# only the month-denominated columns (contract, contract_left, lifetime).
money_cols = ['xtra_charges_tot']
non_month_cols = ['age', 'avg_freq_total', 'avg_freq_curr_mon']
gymchurnfeat_time = gymchurnfeat_v.drop(money_cols, axis=1)
gymchurnfeat_contractlife = gymchurnfeat_time.drop(non_month_cols, axis=1)
gymstayfeat_time = gymstayfeat_v.drop(money_cols, axis=1)
gymstayfeat_contractlife = gymstayfeat_time.drop(non_month_cols, axis=1)
# Interactive distribution of the month-denominated features, churners.
colors = ['#005EA0', '#81d4fa', 'tomato']
hist_data = [gymchurnfeat_contractlife[name]
             for name in gymchurnfeat_contractlife.columns]
fig = create_distplot(hist_data, gymchurnfeat_contractlife.columns,
                      show_rug=False, bin_size=2, colors=colors)
fig.update_layout(
    title='Churn: Interactive Distribution of Lifetime, Contract Length & Contract Time Left Features',
    plot_bgcolor='rgb(252, 250, 232)')
fig.update_xaxes(title_text="Months")
fig.show()
# Interactive distribution of the month-denominated features, stayers.
colors = ['darkorange', 'gold', '#00BBE3']
hist_data = [gymstayfeat_contractlife[name]
             for name in gymstayfeat_contractlife.columns]
fig = create_distplot(hist_data, gymstayfeat_contractlife.columns,
                      show_rug=False, bin_size=2, colors=colors)
fig.update_layout(
    title='Stay: Interactive Distribution of Lifetime, Contract Length & Contract Time Left Features',
    plot_bgcolor='rgb(252, 250, 232)')
fig.update_xaxes(title_text="Months")
fig.show()
# Histogram grid (no KDE) for the six remaining continuous features,
# churners in tomato drawn first, stayers in light blue on top.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
fig.suptitle('Churn v Stay: Distribution Plots for Features related to Time', fontsize=18);
for frame, color in ((gymchurnfeat_time, 'tomato'),
                     (gymstayfeat_time, '#81d4fa')):
    for i, column in enumerate(frame.columns):
        sns.distplot(frame[column], ax=axes[i // 3, i % 3],
                     color=color, kde=False)
subplot_titles = ["Contract Months", "Age", "Contract Months Left",
                  "Lifetime", "Avg Frequency of Visits for Lifetime",
                  "Avg Frequency of Visits compared to prior month"]
for i, title in enumerate(subplot_titles):
    axes[i // 3, i % 3].set_title(title)
fig.subplots_adjust(top=0.88)
plt.show();
Average frequency of visits over customer lifetime is a steep but normal distribution, with those who churn visiting less so their probability density curve peaks higher than that for those who stay and lies to its left. Average frequency of visits compared to the prior month is a normal distribution for those who stay but more right skewed for those who churn with a sharp spike for the fewest visits, then a fairly normal distributed probability density that lies to the left of that for those who stay.
# Split the stayers' month features into contract-only and lifetime-only
# frames; rename 'lifetime' so the stayers' column is distinguishable.
gymstayfeat_contract = gymstayfeat_contractlife.drop(['lifetime'], axis=1)
gymstayfeat_life = gymstayfeat_contractlife.drop(['contract', 'contract_left'], axis=1)
gymstayfeat_life.columns
gymstayfeat_life = gymstayfeat_life.rename(columns={'lifetime': 'lifestay'})
gymstayfeat_life.columns
# Annotated heatmap of the full pairwise correlation matrix.
gymcor = gym.corr()
fig, ax = plt.subplots(figsize=(17, 13))
sns.heatmap(gymcor, ax=ax, annot=True, cmap='coolwarm',
            linewidth=1, linecolor='tomato')
fig.suptitle('Correlation Heat Map for Features and Churn', fontsize=18);
As far as churn goes, the time and money features are the features most negatively correlated with churn.
The most striking correlations in this dataset are between two pairs of features. This heatmap is very important because the high correlations in it point to two sets of features that are nearly entirely duplicative.
Although the contract period is meant to show how much time the customer signed up for, it is .97 correlated, nearly identical, with the amount of time remaining until the contract ends. So it appears that this dataset nearly entirely captures customers at the outset of their membership. One or the other of these features must not be calculated into the analysis and modelling because their nearly identical nature will very much warp the results.
The second set of highly-correlated features are the frequency with which the customers come. The figure for visits per week over the customer lifetime is .95 correlated with the figure for visits per week over the preceding month. This can likely be explained by what is clear from the paragraph above: nearly all of the customers in this dataset are at the outset of their membership. So, for new customers, the visits over the customer lifetime and the visits compared to a month prior is exactly the same figure. Therefore, one of these two features must not be part of the analysis and modelling because they are nearly identical and both together would warp the results.
Official definitions provided for these features:
'Contract_period' — 1 month, 3 months, 6 months, or 1 year
'Month_to_end_contract' — the months remaining until the contract expires
'Avg_class_frequency_total' — average frequency of visits per week over the customer's lifetime
'Avg_class_frequency_current_month' — average frequency of visits per week over the preceding month
#Paring down the dataset to avert disaster from highly correlated features
# Drop one feature from each highly-correlated pair
# (contract/contract_left at .97, avg_freq_total/avg_freq_curr_mon at .95).
slimgym = gym.drop(['contract_left', 'avg_freq_curr_mon'], axis=1)
slimgym.head(1)
slimgym.describe()
# Re-check correlations on the slimmed dataset.
slimgymcor = slimgym.corr()
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(slimgymcor, ax=ax, annot=True, cmap='coolwarm',
            linewidth=1, linecolor='tomato', robust=True)
fig.suptitle('Correlation Heat Map for Features and Churn, Slimmed Dataset', fontsize=18);
# Feature matrix / target vector for the churn models.
X = slimgym.drop('churn', axis=1)
y = slimgym['churn']
# Hold out 20% of the data for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Standardize: fit the scaler on the train set only, then reuse it on test
# so no information leaks from the test set.
scaler = StandardScaler()
X_train_st = scaler.fit_transform(X_train)
print(X_train_st[:5])
X_test_st = scaler.transform(X_test)
# Logistic-regression baseline: fit on the standardized train set, then
# produce hard class predictions and churn-class probabilities for test.
model = LogisticRegression(random_state=0)
model.fit(X_train_st, y_train)
predictions = model.predict(X_test_st)
probabilities = model.predict_proba(X_test_st)[:, 1]
def print_all_metrics(y_true, y_pred, y_proba, title='Classification metrics'):
    """Print accuracy, precision, recall, F1 and ROC-AUC for a classifier.

    ``y_pred`` holds hard class labels; ``y_proba`` holds positive-class
    probabilities (used only for ROC-AUC).
    """
    print(title)
    scores = [
        ('Accuracy', accuracy_score(y_true, y_pred)),
        ('Precision', precision_score(y_true, y_pred)),
        ('Recall', recall_score(y_true, y_pred)),
        ('F1', f1_score(y_true, y_pred)),
        ('ROC_AUC', roc_auc_score(y_true, y_proba)),
    ]
    for name, value in scores:
        print('\t{}: {:.2f}'.format(name, value))

print_all_metrics(y_test, predictions, probabilities, title='Metrics for Logistic Regression:')
# Random forest (100 trees) for comparison against the logistic baseline.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
rf_model.fit(X_train_st, y_train)
rf_predictions = rf_model.predict(X_test_st)
rf_probabilities = rf_model.predict_proba(X_test_st)[:, 1]
print_all_metrics(y_test, rf_predictions, rf_probabilities, title='Metrics for Random Forest:')
# Standardize the full feature matrix and draw a Ward-linkage dendrogram
# to pick a plausible number of clusters.
sc = StandardScaler()
X_sc = sc.fit_transform(X)
linked = linkage(X_sc, method='ward')
plt.figure(figsize=(15, 10))
set_link_color_palette(['#00BBE3', 'sienna', 'tomato', '#00BBE3'])
dendrogram(linked, orientation='top', show_leaf_counts=True,
           above_threshold_color="rosybrown")
plt.title('Hierarchical clustering for slimGym features')
plt.show()
If you examine the above dendrogram, you can differentiate 5 clusters that branch apart at a reasonably high threshold. (The right-hand branch splits into two main sub-branches.)
So my primary analysis is for 5 clusters.
# K-means with 4 clusters (labels 0-3) and 5 clusters (labels 0-4).
# NOTE(review): both fits run on the unscaled X even though X_sc was
# computed above for the dendrogram — confirm this is intentional.
km = KMeans(n_clusters=4, random_state=0)
labels = km.fit_predict(X)
slimgym['km'] = labels
slimgym.info()
km5 = KMeans(n_clusters=5, random_state=0)
labels5 = km5.fit_predict(X)
slimgym['km5'] = labels5
# Compare cluster cohesion for the two solutions.
print('Silhouette_score for 5 clusters: {:.2f}'.format(silhouette_score(X, labels5)))
print('Silhouette_score for 4 clusters: {:.2f}'.format(silhouette_score(X, labels)))
# Mean feature values per cluster: 5-cluster solution, then 4-cluster.
slimgym.drop('km', axis=1).groupby('km5').mean()
slimgym.drop('km5', axis=1).groupby('km').mean()
# Means broken down by cluster AND churn status (money/age columns dropped
# from the plotting frames so the remaining scales are comparable).
kmfeatures = slimgym.drop('km5', axis=1).groupby(['km', 'churn']).mean()
plotkmfeat = kmfeatures.drop(['xtra_charges_tot', 'age'], axis=1)
kmfeatures
km5features = slimgym.drop('km', axis=1).groupby(['km5', 'churn']).mean()
plotkm5feat = km5features.drop(['xtra_charges_tot', 'age'], axis=1)
km5features
In order to visualize features by cluster, I compared mean feature values in visualizations in Tableau.
The two dashboards I made can be viewed here:
https://public.tableau.com/profile/lcodyb
The analysis I wrote below about the 5 clusters reflects the visual analysis in Tableau called Forecast 5km.
These are distributions by features in Seaborn plots.
# Palette reassigned: one color per cluster label (0-4).
colorb = ['skyblue', 'deepskyblue', 'tomato', '#FFB61B', '#FFFF33']

def cluster_plot(df, feature):
    """Plot the distribution of every column of ``df``, split by the cluster
    label ``feature``.

    Columns with more than 12 distinct values are treated as continuous and
    drawn as overlaid KDE curves (one per cluster); the rest are drawn as
    grouped count plots. A legend appears only on two-valued columns.
    """
    plt.figure(figsize=(15, 15))
    area = GridSpec(5, 3)
    for i, column in enumerate(df.drop(feature, axis=1).columns):
        plt.subplot(area[i // 3, i % 3], title=column.replace('_', ' '))
        values = len(df[column].unique())
        features = sorted(df[feature].unique())
        if values > 12:  # continuous
            # Distinct index name so the outer subplot counter `i`
            # is not shadowed by this inner loop.
            for j, x in enumerate(features):
                sns.distplot(df[df[feature] == x][column], hist=False, kde_kws={
                    'shade': True, 'linewidth': 1}, color=colorb[j])
        else:  # discrete
            # BUG FIX: was `data=slimgym` (a module-level global), which
            # silently ignored the `df` argument for all count plots.
            sns.countplot(column, hue=feature, data=df, palette=colorb,
                          alpha=.5, saturation=.9)
            plt.gca().get_legend().remove()
            if values == 2:
                if column == 'gen_id':
                    plt.xticks(np.arange(values), ('F', 'M'))
                    if feature == 'km5':
                        legend = ['0', '1', '2', '3', '4']
                    else:
                        legend = features
                    plt.legend(legend, shadow=True, fancybox=True,
                               title=feature)
                else:
                    plt.xticks(np.arange(values), ('no', 'yes'))
            else:
                plt.xticks(np.arange(values),
                           [int(x) for x in sorted(df[column].unique())])
        plt.xlabel('')
        plt.ylabel('')
    plt.tight_layout()
    plt.suptitle('Cluster Feature Distribution', fontsize=20, y=1.01)
    plt.show()

cluster_plot(slimgym, 'km5')
# Split the slim dataset into one frame per km5 cluster, dropping the
# label and target columns from each.
slimgym_km0, slimgym_km1, slimgym_km2, slimgym_km3, slimgym_km4 = (
    slimgym[slimgym['km5'] == k].drop(['km5', 'churn'], axis=1)
    for k in range(5))
As far as age distribution goes, one can only wonder why nearly everyone who goes to this gym is younger than 35 years old and in a very narrow band of ages around 30. In the last available data, the average age of US health club members was 39 years old, and a full quarter of the 41 million health club members in the US are older than 55. This gym’s promotions, groups, partners are limited to one very rarified group!
All clusters have a similar distribution for primarily nearby gym members and providing a phone number. For all clusters, there is a greater propensity for not having the friend promotion and for not coming as a group. Working for a partner company is a more equal bimodal distribution, with cluster 1 showing somewhat less who work for a partner company than those who don’t.
The non-overlapping distribution of extra charges shows how diverse the clusters are for this critical feature. Essentially the clusters rank themselves from best to worst in this feature, starting with 0 spending the most, then 4, 2, 1 and finally 3 spending very little (one-tenth the amount of cluster 0!)
There are many interesting differences between mean cluster features to observe.
Cluster 0 has the most anomalous feature values. Interesting differences for other clusters include:
# Number of customers in each km5 cluster.
km0customers = slimgym_km0.shape[0]
km1customers = slimgym_km1.shape[0]
km2customers = slimgym_km2.shape[0]
km3customers = slimgym_km3.shape[0]
km4customers = slimgym_km4.shape[0]
customerlist = [km0customers, km1customers, km2customers, km3customers, km4customers]
# Idiom fix: `enumerate` replaces the hand-maintained `count` variable.
for cluster, n_customers in enumerate(customerlist):
    print(n_customers, "customers in cluster", cluster)
# Churned customers per km5 cluster, then the per-cluster churn rate
# (customerlist is ordered by cluster label, matching the groupby index).
churnsum = slimgym.groupby('km5').agg({'churn': 'sum'})
churnsum.churn
churnsum.churn / customerlist
Above is the churn for each cluster. They differ, dramatically. Clusters 0 and 4 are most ready to churn. Cluster 3 is most loyal.
That information is even more interesting when you pair it with the analysis of the mean feature table above. Look at all the unique values for Cluster 0. The Cluster 0 customers who churn nonetheless provide their phone number proportionally more often than other clusters, and those who churn used the friend promotion more on average than any other segment. Those who churn live or work nearby on average more than other clusters and had the longest contracts on average of any cluster segments that churn.
The most fascinating detail is comparing the cluster most likely to churn with the cluster that is extremely loyal by comparison when it comes to extra charges. Cluster 0 spent 10 times as much on extra charges as Cluster 3. Yet Cluster 0 churned the most and Cluster 3 stayed.
Comparing those who churn to those who stay gives some interesting insights. This comparison considers the mean values for each cluster.
Cluster 0 is the only cluster in which more people live or work nearby for the portion that churned than those who stayed.
For provision of phone number, Cluster 0 customers who churned comprise the only boolean segment for which the mean is 1.
For working for a partner company, there are double the amount of Cluster 0 customers who stay than there are Cluster 0 customers who churn. For the other clusters, there are one and half times as many customers who stay as customers who churn.
For group attendance, for all clusters but one, there are double or one and half times as many people who stay as people who churn. No one that churns in Cluster 0 is part of group sessions.
For the friend promotion, there are twice as many people who stay as people who churn who have the friend promotion. The exception is Cluster 0, where more people who churn use the friend promotion than those who do not.
Mean ages all fall within a narrow band. Consistent with the stay/churn averages computed earlier, those who stay are on average approximately three years older than those who churn.
For those who stay compared to those who churn, average lifetime is four to five times as high and contract length is triple the length.
Here are recommendations for customer interaction and retention.